library(stm)
library(stringr)
##################
#Full translation#
##################

# Full-text translation run: bring the saved objects into the workspace.
load(file = "SnowdenC-noRT.RData")

# Keep the list of fitted runs; element 8 is the one used in this section.
runout <- translated.docs.mod.15$runout

# Overall topics for the full-text model.
# Date is coerced to numeric; it enters the model below as s(Date).
translated.docs.meta$Date <- as.numeric(translated.docs.meta$Date)

# Show the topic labels for run 8.
sageLabels(runout[[8]])

# One label per topic: the rows of the marginal "prob" word table
# (requested with n = 7), each collapsed into a comma-separated string.
custom.labs <- apply(
  sageLabels(runout[[8]], n = 7)$marginal$prob,
  1,
  function(words) paste(words, collapse = ", ")
)

# Topic plot for run 8 with the custom labels, written to a PDF.
pdf("figs/STMPlotFullText.pdf")
plot.STM(
  runout[[8]],
  main = "Topics, Full Text Translation",
  custom.labels = custom.labs,
  text.cex = 0.7,
  xlim = c(0, 0.3)
)
dev.off()

#Topic 5: human rights topic
#Topic 14: asylum topic
#Topic 11: attack topic
#Topic 3: Snowden topic

#Figure 8
# Relationship between topics and language for the full-text model.

# Readable names for topics 5, 14 and 11, followed by their word labels.
custom.labs2 <- paste(
  c("Human Rights:", "Asylum:", "Attack:"),
  custom.labs[c(5, 14, 11)]
)

# Regress those three topics on Language plus a smooth term in Date.
prep <- estimateEffect(
  c(5, 14, 11) ~ Language + s(Date),
  runout[[8]],
  metadata = translated.docs.meta
)

# Difference plot with Chinese as cov.value1 and Arabic as cov.value2.
pdf("figs/EffectFullText.pdf")
plot.estimateEffect(
  prep,
  "Language",
  runout[[8]],
  method = "difference",
  cov.value1 = "Chinese",
  cov.value2 = "Arabic",
  labeltype = "custom",
  custom.labels = custom.labs2,
  xlim = c(-0.15, 0.06),
  main = "Topics, Full Text Translation",
  xlab = "Difference in Topic Proportions (Chinese-Arabic)"
)
dev.off()

#Figures 6 and 7
#Note: This will not work without the full text of the post, which is proprietary.
#pdf("figs/AttackThoughts.pdf")
#out <- findThoughts(runout[[8]], translated.docs.meta$TranslatedText, topics=11, n=1)
#text1 <- translated.docs.meta$TranslatedText[as.numeric(out$index[[1]][1])]
#text11 <- translated.docs.meta$OriginalText[as.numeric(out$index[[1]][1])]
#plot(c(0,0), col="white", xlim=c(1,10), ylim=c(1,9), xaxt="n", yaxt="n", xlab="", ylab="")
#lines(c(5.5,5.5), c(-1,20), lty=2)
#text(3, 5,str_wrap(text1,35))
#len <- str_length(text11)
#or <- str_sub(text11,start=seq(0,len+23, 16), end=seq(15,len+23,16))
#text(8,5,paste(or, collapse="\n"), family="Hei", cex = 1)
#dev.off()

# pdf("figs/HumanRightsThoughts.pdf")

#out <- findThoughts(runout[[8]], translated.docs.meta$TranslatedText, topics=5, n=100)
#text2 <- translated.docs.meta$TranslatedText[as.numeric(out$index[[1]][16])]
#text22 <- translated.docs.meta$OriginalText[as.numeric(out$index[[1]][16])]
#plot(c(0,0), col="white", xlim=c(1,10), ylim=c(1,9), xaxt="n", yaxt="n", xlab="", ylab="")
#lines(c(5.5,5.5), c(-1,20), lty=2)
#text(3, 5,str_wrap(text2,35))
#len <- str_length(text22)
#or <- str_sub(text22,start=seq(0,len+23, 16), end=seq(15,len+23,16))
#text(8,5,paste(or, collapse="\n"), family="Hei", cex = 1)
# dev.off()

#Figure 5
# Histograms of the Snowden topic (topic 3 of run 8, the content-covariate
# model) for Chinese and Arabic documents, stacked in one PDF so they can be
# compared with the no-content-covariate version.
# NOTE(review): the "Snowdon" spelling in the file name and plot titles is
# kept unchanged so the generated output stays the same.
pdf("figs/SnowdonContentCovariate.pdf")

# Topic column to plot (was a one-element for loop).
i <- 3

# Per-document topic proportions, split by document language.
chinese <- runout[[8]]$theta[translated.docs.meta$Language == "Chinese", i]
arabic <- runout[[8]]$theta[translated.docs.meta$Language == "Arabic", i]

# Compute the bins without drawing; breaks is now named and FALSE spelled out
# (F is an ordinary variable that can be reassigned).
chist <- hist(chinese, breaks = 20, plot = FALSE)
ahist <- hist(arabic, breaks = 40, plot = FALSE)

# Shared y-axis limit so both panels use the same count scale.
ymax <- max(c(chist$counts, ahist$counts))

# Two panels: Chinese on top, Arabic below.
par(mfrow = c(2, 1))
plot(chist,
     ylim = c(0, ymax), xlim = c(0, 1),
     xlab = "Expected Topic Proportion",
     main = "Snowdon Topic, Content Covariate, Chinese",
     cex.main = 1.5, cex.lab = 1.5,
     density = 25, angle = 45)
plot(ahist,
     ylim = c(0, ymax), xlim = c(0, 1),
     xlab = "Expected Topic Proportion",
     main = "Snowdon Topic, Content Covariate, Arabic",
     cex.main = 1.5, cex.lab = 1.5,
     density = 25, angle = 1)
dev.off()
##########################################
#Full translation w/out content covariate#
##########################################

# Run estimated without a content covariate: load its saved objects.
load("SnowdennoC-noRT.RData")

# List of fitted runs; element 4 is the one used in this section.
runout <- translated.docs.mod.15$runout

translated.docs.meta$Date <- as.numeric(translated.docs.meta$Date)
sageLabels(runout[[4]])

#Topic 8: Snowden topic
#Figure 5
# Histograms of the Snowden topic for Chinese and Arabic documents, to compare
# the model without a content covariate against the one with it.
# NOTE(review): the "Snowdon" spelling in the file name and plot titles is
# kept unchanged so the generated output stays the same.
pdf("figs/SnowdonNoContentCovariate.pdf")

# Topic column to plot (was a one-element for loop).
i <- 8

# Per-document topic proportions, split by document language.
chinese <- runout[[4]]$theta[translated.docs.meta$Language == "Chinese", i]
arabic <- runout[[4]]$theta[translated.docs.meta$Language == "Arabic", i]

# Compute the bins without drawing; breaks is now named and FALSE spelled out
# (F is an ordinary variable that can be reassigned).
chist <- hist(chinese, breaks = 40, plot = FALSE)
ahist <- hist(arabic, breaks = 15, plot = FALSE)

# Shared y-axis limit so both panels use the same count scale.
ymax <- max(c(chist$counts, ahist$counts))

# Two panels: Chinese on top, Arabic below.
par(mfrow = c(2, 1))
plot(chist,
     ylim = c(0, ymax), xlim = c(0, 1),
     xlab = "Expected Topic Proportion",
     main = "Snowdon Topic, No Content Covariate, Chinese",
     cex.main = 1.5, cex.lab = 1.5,
     density = 25, angle = 45)
plot(ahist,
     ylim = c(0, ymax), xlim = c(0, 1),
     xlab = "Expected Topic Proportion",
     main = "Snowdon Topic, No Content Covariate, Arabic",
     cex.main = 1.5, cex.lab = 1.5,
     density = 25, angle = 1)
dev.off()

##########################
#Term-by-term translation#
##########################

# Term-by-term translation run: bring the saved objects into the workspace.
load(file = "SnowdenC-noRT-TermByTerm.RData")

# Keep the list of fitted runs; element 3 is the one used in this section.
runout <- translated.docs.mod.15$runout

# Overall topics for the term-by-term model.
# Date is coerced to numeric; it enters the model below as s(Date).
translated.docs.meta$Date <- as.numeric(translated.docs.meta$Date)

# Show the topic labels for run 3.
sageLabels(runout[[3]])

# One label per topic: the rows of the marginal "prob" word table
# (requested with n = 7), each collapsed into a comma-separated string.
custom.labs <- apply(
  sageLabels(runout[[3]], n = 7)$marginal$prob,
  1,
  function(words) paste(words, collapse = ", ")
)

# Topic plot for run 3 with the custom labels, written to a PDF.
pdf("figs/STMPlotTermByTerm.pdf")
plot.STM(
  runout[[3]],
  main = "Topics, Term-by-Term Translation",
  custom.labels = custom.labs,
  text.cex = 0.7,
  xlim = c(0, 0.3)
)
dev.off()

#Topic 1: human rights topic
#Topic 2: asylum
#Topic 12: attack

#Figure 8
# Relationship between topics and original language, term-by-term model.

# Regress topics 1, 2 and 12 on OriginalLanguage plus a smooth term in Date.
prep <- estimateEffect(
  c(1, 2, 12) ~ OriginalLanguage + s(Date),
  runout[[3]],
  metadata = translated.docs.meta
)

# Readable names for those topics, followed by their word labels.
custom.labs2 <- paste(
  c("Human Rights:", "Asylum:", "Attack:"),
  custom.labs[c(1, 2, 12)]
)

# Difference plot with Chinese as cov.value1 and Arabic as cov.value2.
pdf("figs/EffectTermByTerm.pdf")
plot.estimateEffect(
  prep,
  "OriginalLanguage",
  runout[[3]],
  method = "difference",
  cov.value1 = "Chinese",
  cov.value2 = "Arabic",
  labeltype = "custom",
  custom.labels = custom.labs2,
  xlim = c(-0.2, 0.06),
  main = "Topics, Term-by-Term Translation",
  xlab = "Difference in Topic Proportions (Chinese-Arabic)"
)
dev.off()

##################################
#Make similarity plot (Figure 10)#
##################################
# Term-by-term run: keep copies of its metadata and of fitted run 3.
load(file = "SnowdenC-noRT-TermByTerm.RData")
meta1 <- translated.docs.meta
z1 <- translated.docs.mod.15$runout[[3]]

# Full-text run: same two objects, read again after the second load(), so the
# copies above are what preserves the term-by-term versions.
load(file = "SnowdenC-noRT.RData")
meta <- translated.docs.meta
z <- translated.docs.mod.15$runout[[8]]

# Marginal topic-word probabilities of a fitted model.
#
# model: a list with `beta$logbeta` (a list of log-probability matrices) and,
#        when there is more than one matrix, `settings$covariates$betaindex`
#        (a vector whose tabulated values give one weight per matrix).
#
# Returns exp(logbeta[[1]]) when there is a single matrix; otherwise the
# weighted average of exp(logbeta[[i]]), with weights equal to the share of
# each value in betaindex.
marginalbeta <- function(model) {
  logbeta <- model$beta$logbeta
  margbeta <- exp(logbeta[[1]])

  # With one matrix there is nothing to average. The previous unguarded
  # 2:length(logbeta) counted down to c(2, 1) here and indexed past the list.
  if (length(logbeta) > 1) {
    tab <- table(model$settings$covariates$betaindex)
    weights <- as.numeric(tab) / sum(tab)

    margbeta <- margbeta * weights[1]
    for (i in seq_along(logbeta)[-1]) {
      margbeta <- margbeta + exp(logbeta[[i]]) * weights[i]
    }
  }

  margbeta
}

library(corrplot)

# Figure 10: correlate the document-topic proportions of the two models.

# Put each model's theta matrix in a data frame keyed by document GUID.
translated.thetas <- as.data.frame(z$theta)
translated.thetas$GUID <- meta$GUID
term.thetas <- as.data.frame(z1$theta)
term.thetas$GUID <- meta1$GUID

# Join on GUID so that rows of the two theta blocks refer to the same document.
merged <- merge(translated.thetas, term.thetas, by = "GUID")

# merge() returns the key column first, then the remaining columns of the
# first data frame, then those of the second. The topic counts are read from
# the theta matrices instead of being hard-coded as 15 per model.
k.full <- ncol(z$theta)
k.term <- ncol(z1$theta)
mod1theta <- as.matrix(merged[, 1 + seq_len(k.full), drop = FALSE])
mod2theta <- as.matrix(merged[, 1 + k.full + seq_len(k.term), drop = FALSE])

# Short topic labels (n = 3 words) for the rows and columns of the plot.
labels1 <- apply(sageLabels(z, n = 3)$marginal$prob, 1, paste, collapse = ",")
labels2 <- apply(sageLabels(z1, n = 3)$marginal$prob, 1, paste, collapse = ",")

# Rows: full-text topics; columns: term-by-term topics.
correlations <- cor(mod1theta, mod2theta)
rownames(correlations) <- labels1
colnames(correlations) <- labels2

pdf('figs/theta_corr_plot.pdf')
corrplot(correlations)
dev.off()






